EDA

Since we know what the data is, let’s try a 3d plot… colored by gender

p <- plotly::plot_ly(data = measure, x = ~chest, y = ~waist, z = ~hips , type = 'scatter3d',mode = 'markers',color = ~gender)
# Sys.setenv("plotly_username"="srivathsesh")
# Sys.setenv("plotly_api_key"="0Sr49roqMwTtuQl3JZ9G")
# api_create(p, filename = "r-clusterAnalysis")
# knitr::include_graphics('https://plot.ly/~srivathsesh/5.embed')
p

Great, now what??

Let’s see if we can create the visual clustering mathematically…

Lets start with the eucledian distance. Lets manually create the euclidean distance between first row and the second row \[\sqrt{(measure[1,1] - measure[2,1])^2 + ( measure[1,2] - measure[2,2])^2 + (measure[1,3] - measure[2,3])^2}\] 6.164414

# create a matrix of the euclidean distances
dm <- dist(measure[1:3], method = "euclidean" )
single <- hclust(dm,method = "single")
plot(single,main = "Single")

Complete <- hclust(dm,method = "complete")
plot(Complete,main =  "Complete")

Average <- hclust(dm,method = "average")
#plot(as.dendrogram(Average),main = "Average",horiz = T)
plot(Average,main = "Average")

How do they compare to the principle components ?

pc <- princomp(measure[,1:3],cor = T)
summary(pc)
## Importance of components:
##                           Comp.1    Comp.2     Comp.3
## Standard deviation     1.4391057 0.7952666 0.54454193
## Proportion of Variance 0.6903417 0.2108163 0.09884197
## Cumulative Proportion  0.6903417 0.9011580 1.00000000
pc2 <- princomp(dm, cor = T)
summary(pc2)
## Importance of components:
##                           Comp.1    Comp.2    Comp.3     Comp.4
## Standard deviation     3.4600702 2.1000928 1.4984859 0.80761233
## Proportion of Variance 0.5986043 0.2205195 0.1122730 0.03261188
## Cumulative Proportion  0.5986043 0.8191238 0.9313968 0.96400865
##                             Comp.5      Comp.6      Comp.7      Comp.8
## Standard deviation     0.440832659 0.403036145 0.302905493 0.269054424
## Proportion of Variance 0.009716672 0.008121907 0.004587587 0.003619514
## Cumulative Proportion  0.973725322 0.981847229 0.986434816 0.990054330
##                             Comp.9     Comp.10     Comp.11     Comp.12
## Standard deviation     0.213776213 0.210015148 0.163842236 0.146733170
## Proportion of Variance 0.002285013 0.002205318 0.001342214 0.001076531
## Cumulative Proportion  0.992339344 0.994544662 0.995886876 0.996963407
##                             Comp.13      Comp.14      Comp.15      Comp.16
## Standard deviation     0.1332257792 0.1138641186 0.0982936332 0.0831398915
## Proportion of Variance 0.0008874554 0.0006482519 0.0004830819 0.0003456121
## Cumulative Proportion  0.9978508622 0.9984991140 0.9989821959 0.9993278080
##                             Comp.17      Comp.18      Comp.19 Comp.20
## Standard deviation     0.0755832855 0.0629578455 0.0613784668       0
## Proportion of Variance 0.0002856417 0.0001981845 0.0001883658       0
## Cumulative Proportion  0.9996134497 0.9998116342 1.0000000000       1
measure$pc1 <- pc$scores[,1]
measure$pc2 <- pc$scores[,2]
measure$dmpc1 <- pc2$scores[,1]
measure$dmpc2 <- pc2$scores[,2]
measure$single <- cutree(single,h = 3.6)
measure$complete <- cutree(Complete, h = 10)
measure$avg <- cutree(Average, h = 7.9)
plot_ly(data = measure,x = ~pc1, y = ~pc2, color = ~gender, type = 'scatter', mode = 'markers',colors = "Set1") %>% 
  add_text(text = measure$avg, textposition = "top right",showlegend = F) %>% 
  layout(title = 'PCA using raw data and class labels based on Average')
plot_ly(data = measure,x = ~dmpc1, y = ~dmpc2, color = ~gender, type = 'scatter', mode = 'markers',colors = "Set1") %>% 
  add_text(text = measure$avg, textposition = "top right",showlegend = F) %>% 
  layout(title = 'PCA using Eucleadian distances and class labels based on Average')

## K-Means Clustering - Textbook example

crime <- readRDS(file = 'crime.rds')

symbol <- rep(20,51)
symbol[which(crime$Murder > 15)] <- 3
pairs(crime,pch = symbol)

The “+” symbol indicates the record for MD… did you see that its an outlier. Lets leave that out data point

crime_subset <- dplyr::filter(crime, rownames(crime) != 'DC')
rownames(crime_subset) <- rownames(crime[which(rownames(crime) != 'DC'),])
sapply(crime_subset,'var')
##       Murder         Rape      Robbery      Assault     Burglary 
##     11.93492    209.76335  11889.56122  19373.53510 175895.00449 
##        Theft      Vehicle 
## 565276.55878  43997.35878

See the variance is different for the different variables? It necessary standardize the variables… Not necessarily scale it. The scaling doesn’t matter as we are interested in the distances within each axis (variable… see the euclidean formula above.)

# Get the ranges for each column
rge <- sapply(crime_subset, function(x) diff(range(x)))
crime_s <- sweep(crime_subset,2,rge,FUN = "/")
rownames(crime_s) <- rownames(crime_subset)
n <- nrow(crime_s)
OverallSS <- (n-1) * sum(sapply(crime_s,var))

kmeanscree <- function(data,center) {
  withinss <- purrr::map(.x = center, .f = function(x) kmeans(data,center =x)$tot.withinss)
  withinss <- c((nrow(data) - 1) * sum(sapply(data,var)),withinss)
  plot(x = c(1,center), y = unlist(withinss),xlab = 'Number of clusters',ylab = 'Within cluster Sum of Squares',type = 'b' )
}
kmeanscree(data = crime_s,center = 2:6)

2 clusters form the elbow of the scree plot. We’ll use the 2 clusters for the kmean clustering.

kclust <- kmeans(x = crime_s, centers = 2)
# converting back to non standardized units
kclust$center*rge
##      Murder     Rape  Robbery   Assault  Burglary     Theft   Vehicle
## 1  9.368182 376.4971 829.2781 555.98881  51.36291  640.0962 2070.7959
## 2 23.165629 232.6096 438.9973   3.97189 255.21604 1561.7962  247.0357
# PCA
pca.crime <- princomp(x = crime_s,cor = F)
summary(pca.crime)
## Importance of components:
##                           Comp.1    Comp.2     Comp.3     Comp.4
## Standard deviation     0.5162052 0.2384135 0.20256101 0.13721052
## Proportion of Variance 0.6477705 0.1381778 0.09974432 0.04576688
## Cumulative Proportion  0.6477705 0.7859483 0.88569258 0.93145946
##                            Comp.5     Comp.6     Comp.7
## Standard deviation     0.10914469 0.10332169 0.07487990
## Proportion of Variance 0.02895887 0.02595132 0.01363035
## Cumulative Proportion  0.96041833 0.98636965 1.00000000
# populate df for plotting
df.pca <- data.frame(pc1 = pca.crime$scores[,1], pc2 = pca.crime$scores[,2],clusters = as.factor(kclust$cluster),Index = rownames(crime_s))
ggplot(df.pca,mapping = aes(x = pc1, y = pc2, color = clusters,label = Index)) + geom_point() + geom_text(size = 3) + theme_bw()